# Clustering with K-means on a subset to determine an optimal k.
#
# For each candidate k the loop records the K-means inertia ("distortion") and,
# for k > 1, the silhouette score. Both curves are plotted, then the final
# model is fitted with the chosen k and its labels are attached to
# `attempt_two` as the 'cluster label' column.

# Fit on a copy without any previously attached 'cluster label' column, so
# re-running this cell does not feed old labels back in as a feature.
# On a first run this is simply a copy of `attempt_two`.
kmeans_features = attempt_two.drop(columns=['cluster label'], errors='ignore')

distortion_from_centroid = []
silhouette = []
silhouette_ks = []  # the k values that actually have a silhouette score
clusters = range(1, 20)  # 1 to 234 was tested

for k in clusters:
    # `algorithm` is left at the library default: the former 'auto' value is
    # deprecated in scikit-learn 1.1 and rejected from 1.3 onwards.
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        max_iter=300,
        verbose=False,
        tol=0.01,
        random_state=42,
    )
    model.fit(kmeans_features)
    distortion_from_centroid.append(model.inertia_)

    # The silhouette score needs at least two clusters, so k = 1 is skipped.
    if k > 1:
        labels = model.labels_
        silhouette_ks.append(k)
        silhouette.append(
            silhouette_score(kmeans_features, labels, metric='euclidean')
        )

# k used for the final model and highlighted (green circle) on both plots.
chosen_k = 14

# Plotting elbow curve
Elbow_curve = plt.figure(figsize=(5, 5))
plt.plot(clusters, distortion_from_centroid, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion from centroid')
plt.title('The optimal value of k')
plt.ylim(0, 4000)
plt.xlim(0, 20)
# The y position of the marker is a hand-picked value for this data set.
plt.scatter(chosen_k, 1850, facecolors='none', edgecolors='g', s=1000, alpha=0.4)
plt.show()

# Plotting silhouette scores
silhouette_score_plot = plt.figure(figsize=(5, 5))
# x values come from the loop, so they stay aligned with `silhouette` if
# `clusters` is changed.
plt.plot(silhouette_ks, silhouette, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette scores')
plt.title('The optimal value of k')
plt.xlim(0, 20)
# The y position of the marker is a hand-picked value for this data set.
plt.scatter(chosen_k, 0.58, facecolors='none', edgecolors='g', s=700)
plt.show()

# let k = 14
model = KMeans(
    n_clusters=chosen_k,
    init='k-means++',
    n_init=10,
    max_iter=300,
    verbose=False,
    tol=0.01,
    random_state=42,
)

# fit model
km_preds = model.fit_predict(kmeans_features)
# Store the labels as integers; the original `.astype(int)` call discarded its
# result and therefore had no effect.
attempt_two['cluster label'] = km_preds.astype(int)

# Notebook display expressions: cluster sizes, then a preview of the data.
pd.DataFrame(attempt_two['cluster label'].value_counts())
attempt_two.head()
# Repeating clustering with a different algorithm (DBSCAN) as a cross-check
# of the K-means labels.
# Reference: https://www.science.org/doi/10.1126/science.1136800

# Uses the first five columns of `attempt_two`.
# NOTE(review): presumably these are the feature columns (i.e. everything
# except 'cluster label') - confirm against the frame's actual layout.
cluster_check = attempt_two.iloc[:, 0:5].copy()

# NOTE(review): this also rebinds `model`, replacing the fitted K-means
# estimator. Kept for compatibility with any later code that reads `model`;
# consider dropping the alias if nothing depends on it.
db = model = DBSCAN(
    eps=0.015,
    min_samples=15,
    metric='cosine',
    algorithm='auto',
)
db_labels = db.fit_predict(cluster_check)
db_labels  # notebook display

# Plot the two labelings against each other to see how they line up.
# Column 0 holds the DBSCAN labels, 'km_labels' the K-means labels.
# NOTE(review): cluster ids are arbitrary in both algorithms, so agreement
# will not necessarily appear as a straight line; scikit-learn documents that
# DBSCAN labels noise points as -1.
comparing_points = pd.DataFrame(db_labels)
comparing_points['km_labels'] = km_preds
comparing_points  # notebook display

plt.figure()
plt.scatter(comparing_points['km_labels'], comparing_points[0])
plt.xlabel('K-means cluster label')
plt.ylabel('DBSCAN cluster label')
plt.show()